# This file is part of the replication packet for "A Low-Cost Information Nudge Increases Citizenship Application Rates Among Low-Income Immigrants"

# this file recreates the sample balance table in the paper that compares the NNY sample to the fee waiver population in the US, New York State, and New York City

# Input
#     acs2017Prepped.csv - a CSV file created by the file acs_prep that flags the fee waiver eligible population
#     NNYFeeWaiverReplicationData.dta - a DTA file of the replicaiton data for the NNY paper
# Output
#     sampleComparisons.tex - a tex file that contains the comparison table



# loading in the packages required ----------------------------------------


library(dplyr)
library(foreign)
library(statar)
library(xtable)


# setting the data location -----------------------------------------------

### NEED TO CHANGE DATA LOCATIONS

# set the data location
data_location <- ""
output_location <- ""


# loading in the data files -----------------------------------------------

# load the ACS data
setwd(data_location)
acs2017 <- read.csv("acs2017Prepped.csv", stringsAsFactors = FALSE)
# reformatting the names to merge with the NNY file
names(acs2017) <- tolower(names(acs2017))
dim(acs2017)

# load the NNY data
setwd(data_location)
nny2017_data <- readstata13::read.dta13("NNYFeeWaiverReplicationData.dta")
head(nny2017_data)

# some NNY variables were coarsened to protect participant information, these are set to NA for these calculations
nny2017_data$hhinc <-NA
nny2017_data$age <- NA
nny2017_data$hhsize <- NA


# comparing the two data sets and cretaing some variables to match each data set ---------------------------------------------

# keeping fee waiver eligible in acs2017
acs2017 <- filter(acs2017, fee_waiver_eligible == 1)
dim(acs2017)
acs2017 %>% tab(ny_state, nyc)
acs2017 %>% tab(nyc)

# combining the datasets
# age
acs2017$age <- acs2017$agep
summary(acs2017$age)
summary(nny2017_data$age)

# gender
acs2017 %>% tab(sex)
nny2017_data %>% tab(gender_f)
acs2017$gender_f <- 0
acs2017$gender_f[acs2017$sex == 2] <- 1

# married
acs2017 %>% tab(mar)
acs2017$marital_married <- 0
acs2017$marital_married[acs2017$mar == 1] <- 1
acs2017 %>% tab(marital_married)
nny2017_data %>% tab(marital_married)

# mtb - means tested benefits (SNAP, TANF, SSI, Medicaid)
nny2017_data %>% tab(mtbany)
acs2017$mtbany <- acs2017$meanstested
acs2017 %>% tab(mtbany)

# income
summary(nny2017_data$hhinc)
# adjusting the scale for income
acs2017$hhinc = acs2017$hincp / 1000
nny2017_data$hhinc <- nny2017_data$hhinc

# for the paper's analysis, negative income values were set to zero, so following the same procedure
nny2017_data$hhinc[nny2017_data$hhinc < 0] <- 0
summary(acs2017$hhinc)
summary(nny2017_data$hhinc)
summary(acs2017)

# hhsize
summary(nny2017_data$hhsize)
acs2017$hhsize <- acs2017$np
summary(acs2017$hhsize)

# income per capita
summary(nny2017_data$hhinc)
summary(nny2017_data$hhsize)
summary(nny2017_data$hhinc_cap)
# creating income per capita by dividing income by household size
acs2017$hhinc_cap <- acs2017$hhinc / acs2017$hhsize
nny2017_data$hhinc_cap <- nny2017_data$hhinc / nny2017_data$hhsize
summary(acs2017$hhinc_cap)
nny2017_data %>% filter(hhinc > 30) %>% select(hhinc, hhinc_cap, hhsize) %>% arrange(hhinc)


#years in US
# yoep is the ACS variable for year of entry
summary(nny2017_data$yearsinus)
acs2017$yearsinus <- 2017 - acs2017$yoep
summary(nny2017_data$yearsinus)
summary(acs2017$yearsinus)

#Highest education
nny2017_data %>% tab(educ)
acs2017 %>% tab(schl)

# less than high schoool
nny2017_data %>% tab(NoHSDiplom, educ)
acs2017$NoHSDiplom <- 0
acs2017$NoHSDiplom[acs2017$schl < 16] <- 1
summary(nny2017_data$NoHSDiplom)
summary(acs2017$NoHSDiplom)

# high school degree
nny2017_data %>% tab(educ_HS, educ)
acs2017$educ_HS <- 0
acs2017$educ_HS[acs2017$schl == 16 | acs2017$schl == 17] <- 1
summary(nny2017_data$educ_HS)
summary(acs2017$educ_HS)

# greater than high school
nny2017_data %>% tab(educ_BA, educ)
acs2017$educ_BA <- 0
acs2017$educ_BA[acs2017$schl > 19] <- 1
summary(nny2017_data$educ_BA)
summary(acs2017$educ_BA)


# country of origin

# Chinese
# pobp is the ACS varaible for place of birth. 207 is China
summary(nny2017_data$origin_China)
acs2017$origin_China <- 0
acs2017$origin_China[acs2017$pobp == 207] <- 1
summary(nny2017_data$origin_China)
summary(acs2017$origin_China)

# Mexican
# pobp is the ACS varaible for place of birth. 303 is Mexico
summary(nny2017_data$origin_Mexico)
acs2017$origin_Mexico <- 0
acs2017$origin_Mexico[acs2017$pobp == 303] <- 1
summary(nny2017_data$origin_Mexico)
summary(acs2017$origin_Mexico)

# Dominican
# pobp is the ACS varaible for place of birth. 329 is the Dominican Republic
summary(nny2017_data$origin_DR)
acs2017$origin_DR <- 0
acs2017$origin_DR[acs2017$pobp == 329] <- 1
summary(nny2017_data$origin_DR)
summary(acs2017$origin_DR)

# Ecuadorian
# pobp is the ACS varaible for place of birth. 365 is Ecuador
summary(nny2017_data$origin_Ecuador)
acs2017$origin_Ecuador <- 0
acs2017$origin_Ecuador[acs2017$pobp == 365] <- 1
summary(nny2017_data$origin_Ecuador)
summary(acs2017$origin_Ecuador)


# creating a dataframe for the group comparison ---------------------------

# creating the comparison groups
groups <- c("nny", "nyc", "ny_state", "usa")

# creating flags for which comparison group each observation belongs to
acs2017$acs <- 1
acs2017$nny <- 0

nny2017_data$nny <- 1
nny2017_data$acs <- 0
nny2017_data$nyc <- 0
nny2017_data$ny_state <- 0


# selecting variables that will be used in the comparison table
vars <- c("hhinc", "hhsize", "hhinc_cap", "age", "gender_f","origin_DR", "origin_China", "origin_Ecuador","origin_Mexico", 
          "marital_married",  "NoHSDiplom", "educ_HS", "educ_BA", "mtbany",
           "nyc", "ny_state", "acs", "nny")

# checking to make sure all of the varaibles are in each comparison group
vars[!(vars %in% names(acs2017))]
vars[!(vars %in% names(nny2017_data))]

# subsetting the dataframes to just the selected variables
acs2017 <- acs2017[,vars]
nny2017_data <- nny2017_data[,vars]

# combining the two data frames
df_combined <- rbind(acs2017, nny2017_data)

# creating an empty dataframe to contain the comparison values
# # of rows are the comparison variables, number of columns are the comparison groups
df_outcomes <- data.frame(matrix(nrow=length(vars), ncol = length(groups) + 1))
df_outcomes

# renaming the columns after the comparison groups, including a first column called VarName
names(df_outcomes) <- c("VarName", groups)
# inserting the variable names into the first column
df_outcomes$VarName <- vars


# creating a dataframe for each comparison group that will be used to calculate the means
nny <- data.frame(df_combined %>% filter(nny==1))
nny$hhsize <- NA
nyc <- data.frame(df_combined %>% filter(nyc == 1))
ny_state <-  data.frame(df_combined %>% filter(ny_state == 1))
usa <- data.frame(df_combined %>% filter(acs == 1))



# calculating the means for each group ------------------------------------


# for each row (which each is a comparison variable), calculate the mean of a column (columns are the variables in group dataframes), and put it in the correct cell
for (i in 1:nrow(df_outcomes)){
  df_outcomes[i,"nny"] <- mean(nny[,i])
}

for (i in 1:nrow(df_outcomes)){
  df_outcomes[i,"nyc"] <- mean(nyc[,i])
}

for (i in 1:nrow(df_outcomes)){
  df_outcomes[i,"ny_state"] <- mean(ny_state[,i])
}

for (i in 1:nrow(df_outcomes)){
  df_outcomes[i,"usa"] <- mean(usa[,i])
}


# formatting the output table ---------------------------------------------


# rounding the calculations for all but the label column
df_outcomes[,-1] <- round(df_outcomes[,-1], 2)
df_outcomes
# keeping only the comparison variables (not the group labels)
df_outcomes_output <- df_outcomes[1:14,]

# renaming the variable labels for output
df_outcomes_output[,1] <- c("HH Income (1,000s)", "Household Size", "HH Income P.Cap (1,000s)",
                     "Age", "Female", "Dominican Republic", "China", "Ecuador", "Mexico","Married",
                     "Less Than High School", "High School/GED degree", "BA Degree or Higher",
                     "Receives Means-Tested Benefits")
# renaming the columns
names(df_outcomes_output) <- c("", "NaturalizeNY", "New York City", "New York State", "United States")
df_outcomes_output

# creating the text to output with xtable
sampleComparisons <- xtable(df_outcomes_output, caption="Comparison of the Sample of Registrants", label = "sampleComparison")
comm <- paste0("\\hline \n \\multicolumn{5}{p{6.2in}}",
               "{\\scriptsize{\\emph{Note}: This table compares the sample in the experiment to the immigrant population that would be eligible for the fee waiver in New York City, New York
               State, and the United States. The comparison groups were created using data from the 2017 American Community Survey.}} \n")
sampleComparisons


# writing the tex file ----------------------------------------------------


setwd(output_location)
print(sampleComparisons, include.rownames=FALSE, caption.placement = "top", hline.after=c(-1,-1, 0),
      add.to.row = list(pos = list(14), command = comm), file = "sampleComparisons.tex")


